data <- read.csv("/Users/macbookpro/Desktop/Crow/data/wine_df.csv")
data$X <- NULL
head(data)
##   alcohol malic_acid  ash alcalinity_of_ash magnesium total_phenols flavanoids
## 1   14.23       1.71 2.43              15.6       127          2.80       3.06
## 2   13.20       1.78 2.14              11.2       100          2.65       2.76
## 3   13.16       2.36 2.67              18.6       101          2.80       3.24
## 4   14.37       1.95 2.50              16.8       113          3.85       3.49
## 5   13.24       2.59 2.87              21.0       118          2.80       2.69
## 6   14.20       1.76 2.45              15.2       112          3.27       3.39
##   nonflavanoid_phenols proanthocyanins color_intensity  hue
## 1                 0.28            2.29            5.64 1.04
## 2                 0.26            1.28            4.38 1.05
## 3                 0.30            2.81            5.68 1.03
## 4                 0.24            2.18            7.80 0.86
## 5                 0.39            1.82            4.32 1.04
## 6                 0.34            1.97            6.75 1.05
##   od280.od315_of_diluted_wines proline
## 1                         3.92    1065
## 2                         3.40    1050
## 3                         3.17    1185
## 4                         3.45    1480
## 5                         2.93     735
## 6                         2.85    1450
summary(data)
##     alcohol        malic_acid         ash        alcalinity_of_ash
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60    
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20    
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50    
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49    
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50    
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00    
##    magnesium      total_phenols     flavanoids    nonflavanoid_phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  proanthocyanins color_intensity       hue         od280.od315_of_diluted_wines
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270               
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938               
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780               
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612               
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170               
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000               
##     proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0

Features are in much different scales. We have to scale the data before proceeding to clustering.

# Scale data.
data_scaled <- scale(data)

# Calculate PCA.
pca <- data.frame(prcomp(data_scaled)$x)

Let’s run repeated stochastic clustering using kmeans. We will run the algorithm 30 times, check it’s clustering stability over repeated iterations, and get the majority voting label for each sample.

Will set the number of clusters to 5 for this first experiment.

scr <- cRowflow::stochastic_clustering_runner(
  data_scaled,
  kmeans,
  labels_name = "cluster",
  n_runs = 30,
  centers=5
)

Let’s visualise the majority voting labels returned by cRowflow.

create_3d_plot_pca(pca, scr$majority_voting_labels, T, title_addition = " - Clusters")

Let’s now examine the stability of each element over the repeated iterations.

summary(scr$ecc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3384  0.5060  0.6782  0.6455  0.7779  0.8668

The mean ECC is 0.65 and median is 0.68.

create_3d_plot_pca(pca, scr$ecc, title_addition = " - ECC")

The most unstable points are the ones at the boundaries between clusters 2, 3, and 5.

We can use cRowflow’s parameter_searcher function to find optimal clustering parameter values that result in reproducible and robust clustering results.

param_grid = list(centers=seq(3,8), algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))

parameter_searcher <- cRowflow::parameter_searcher(
  data_scaled,
  kmeans,
  labels_name = "cluster",
  param_grid = param_grid,
  n_runs = 30,
  iter.max = 30
)

Let’s visualise the results when changing the number of clusters and the algorithm implementation.

cRowflow::plot_heatmap(parameter_searcher$results_df, "centers", "algorithm")
## Selected keys for visualization: centers, algorithm
## Creating DataFrame from parameter search results...
## No duplicates found. Proceeding without aggregation.

The heatmap displays the median ECC values for different combinations of k-means clustering algorithms and the number of cluster centers. The results indicate that lower center values (e.g., 3, 4) yield higher ECC values, with the Hartigan-Wong implementation producing more stable results in all cases. In contrast, higher center values (e.g., 8) lead to lower ECC scores across all algorithms.

Let’s further optimize clustering stability when the number of clusters (centers) is set to 4 and the implementation is Hartigan-Wong by identifying the optimal feature subset using a genetic algorithm. The current median ECC for that configuration is 0.79.

set.seed(42)
genetic_fs <- cRowflow::genetic_algorithm_feature_selector(
  data_scaled,
  kmeans,
  labels_name = "cluster",
  verbose = T,
  n_generations_no_change = 5,
  centers=4,
  algorithm = "Hartigan-Wong",
  iter.max=30
)
## Gen 0 - Best ECC: 0.9567
## Gen 1 - Best ECC: 0.9567
## Gen 2 - Best ECC: 0.9678
## Gen 3 - Best ECC: 0.9678
## Gen 4 - Best ECC: 0.9678
## Gen 5 - Best ECC: 0.9678
## Gen 6 - Best ECC: 0.9678
## Gen 7 - Best ECC: 0.9678
cRowflow::plot_ga_fitness_evolution(genetic_fs$history)

genetic_fs_best_fitness <- genetic_fs$best_fitness_scr_result
create_3d_plot_pca(pca, genetic_fs_best_fitness$ecc, title_addition = " - ECC AFTER GA OPTIMIZATION")
create_3d_plot_pca(pca, genetic_fs_best_fitness$majority_voting_labels, categorical = T, title_addition = " - CLUSTERS")

By keeping only 7 of the features we were able to produce a much more stable clustering with the same configurations (centers=4, algorithm = “Hartigan-Wong”, iter.max=30).

colnames(data_scaled[,genetic_fs$best_features])
## [1] "alcohol"           "ash"               "alcalinity_of_ash"
## [4] "total_phenols"     "flavanoids"        "hue"              
## [7] "proline"

We can